### Replication Package for "Why is Intermediating Houses so Difficult? Evidence from iBuyers"
### Buchak, Matvos, Piskorski, and Seru
###
###
### buchak@stanford.edu

### Helper file with common functions.

library(data.table)
library(ggplot2)
library(zoo)
library(Hmisc)
library(stringr)


# Turn on for replication package
REPLICATION = T



loadData <- function(arms.length.only = T,SAMPLE=REPLICATION) {
  # Main data loader / prepper function
  if(!SAMPLE) {
    data.in <- fread('../data/processed/corelogic/Combined_processed.csv')
  } else {
    data.in <- fread('../data/processed/corelogic/share/Combined_processed.csv')
  }
  
  if(arms.length.only) {
    data.in <- data.in[propertytype == '10' & transactiontype == '1']
  } else {
    data.in <- data.in[propertytype == '10']
  }
  
  # Fix dates
  data.in[,date := as.Date(date)]
  data.in[,seller.buy.date := as.Date(seller.buy.date)]
  
  # Create some fields
  data.in[,zip5 := as.numeric(substr(situszipcode,1,5))]
  data.in[,qtr := as.yearqtr(date)]
  
  # Merge with census with geographical characteristics
  census.in <- loadCensusData()
  census.in[,income.bin := cut(median.household.income,breaks = c(0,30000,50000,70000,90000,110000,1e999),labels = c('<30k','30-50k','50-70k','70-90k','90-110k','110k+'))]
  census.in[,income.bin := factor(income.bin,levels(income.bin)[c(6,1:5)])]
  
  census.in[,college.bin := cut(pct.bachelors,breaks = c(0,.15,.30,.45,.60,1),labels=c('<15%','15-30%','30-45%','45-60%','60%+'))]
  census.in[,college.bin := factor(college.bin,levels(college.bin)[c(5,1:4)])]
  
  census.in[,median.age.bin := cut(median.age ,breaks = c(0,30,40,50,60,200),labels=c('<30','30-40','40-50','50-60','60+'))]
  census.in[,median.age.bin := factor(median.age.bin,levels(median.age.bin)[c(5,1:4)])]
  
  data.in <- merge(data.in,census.in,by.x='zip5',by.y='zip5')
  
  data.in[,living_sqft := as.numeric(living_sqft)]
  data.in[,land_sqft := as.numeric(landsquarefootage)]  
  data.in[,house.age := year - as.numeric(yearbuilt)]
  
  
  data.in <- data.in[is.na(saleamount) | saleamount < 10000000 & land_sqft < 50000]
  
  # Create bins on house characteristics
  data.in[,price.bin := cut(saleamount,breaks = c(0,100000,250000,500000,1000000,1e999))]
  data.in[,price.bin := factor(price.bin,levels(price.bin)[c(5,1:4)])]
  
  data.in[,age.bin := cut(house.age,breaks = c(0,5,15,50,1000000))]
  data.in[,age.bin := factor(age.bin,levels(age.bin)[c(4,1:3)])]
  
  data.in[,size.bin := cut(land_sqft,breaks = c(0,5000,10000,25000,1000000))]
  data.in[,size.bin := factor(size.bin,levels(size.bin)[c(4,1:3)])]
  
  data.in[,multistory := as.integer(storiesnum > 1)]
  
  data.in[,seller.ltv.bin := cut(seller.ltv,breaks = c(0,.25,.5,.75,.80,.85,.9,.95,100),labels = c('0-0.25','0.25-0.50','0.50-0.75','0.75-0.80','0.80-0.85','0.85-0.90','0.90-0.95','0.95-1.00'))]
  
  
  # Variable called ibuyer, which is just buyer OR seller
  data.in[,iBuyer := as.integer(iBuyer.buyer==1 | iBuyer.seller==1)]
  
  
  # Tenure
  data.in[,seller.tenure := as.numeric(date-seller.buy.date)]
  
  
  return(data.in)
  
  
  
}



loadCensusData <- function() {
  census.in <- fread('../data/raw/census/nhgis0023_ds201_20135_2013_zcta.csv')
  census.in[,zip5 := ZCTA5A]
  
  
  # Topics
  # UEF: age by sex
  # UEQ: Race
  # UEZ: Hispanic or latino
  # UFF: Means of transport
  # UFH: Travel time
  # UGS: Educational attainment
  # UHD: Median household income
  # UJ8: Employment status
  # UMF: Rent as % of household income
  
  # Age by sex --> just keep total.
  data.temp <- census.in[,c('zip5','UEFE001'),with=F]
  names(data.temp) <- c('zip5','median.age')
  data.working <- data.temp
  
  # Race --> get total population and % white.
  data.temp <- census.in[,c('zip5','UEQE001','UEQE002'),with=F]
  data.temp[,pop.total := UEQE001]
  data.temp[,pct.white := UEQE002 / UEQE001]
  data.temp <- data.temp[,c('zip5','pop.total','pct.white'),with=F]
  data.working <- merge(data.working,data.temp,by='zip5')
  
  # Hispanic or latino -> get % latino
  data.temp <- census.in[,c('zip5','UEZE001','UEZE003'),with=F]
  data.temp[,pct.hispanic := UEZE003 / UEZE001]
  data.temp <- data.temp[,c('zip5','pct.hispanic'),with=F]
  data.working <- merge(data.working,data.temp,by='zip5')  
  
  # Transportation -> Get % car, % public, % walk or bike
  data.temp <- census.in[,c('zip5','UFFE001','UFFE002','UFFE010','UFFE018','UFFE019'),with=F]
  data.temp[,pct.commute.car := UFFE002 / UFFE001]
  data.temp[,pct.commute.public := UFFE010 / UFFE001]
  data.temp[,pct.commute.bike.walk := (UFFE018 + UFFE019) / UFFE001]
  data.temp <- data.temp[,c('zip5','pct.commute.car','pct.commute.public','pct.commute.bike.walk'),with=F]
  data.working <- merge(data.working,data.temp,by='zip5')
  
  # Travel time -> Get % with commute above 60 minutes
  data.temp <- census.in[,c('zip5','UFHE001','UFHE012','UFHE013'),with=F]
  data.temp[,pct.commute.above.60 := (UFHE012 + UFHE013) / UFHE001]
  data.temp <- data.temp[,c('zip5','pct.commute.above.60')]
  data.working <- merge(data.working,data.temp,by='zip5')  
  
  # Educational attainment -> Get % with bachelors or above
  data.temp <- census.in[,c('zip5','UGSE001','UGSE022','UGSE023','UGSE024','UGSE025'),with=F]
  data.temp[,pct.bachelors := (UGSE022 + UGSE023 + UGSE024 + UGSE025) / UGSE001]
  data.temp <- data.temp[,c('zip5','pct.bachelors'),with=F]
  data.working <- merge(data.working,data.temp,by='zip5')
  
  # Median household income --> Just use median household income.
  data.temp <- census.in[,c('zip5','UHDE001'),with=F]
  names(data.temp) <- c('zip5','median.household.income')
  data.working <- merge(data.working,data.temp,by='zip5')
  
  # Unemployment -> Use percentage of unemployed in civilian labor force
  data.temp <- census.in[,c('zip5','UJ8E003','UJ8E005'),with=F]
  data.temp[,pct.unemployed := UJ8E005 / UJ8E003]
  data.temp <- data.temp[,c('zip5','pct.unemployed'),with=F]
  data.working <- merge(data.working,data.temp,by='zip5')
  
  # Rental cost as fraction of income --> get percent paying over 50% of income
  data.temp <- census.in[,c('zip5','UMFE001','UMFE008','UMFE009','UMFE010'),with=F]
  data.temp[,pct.rent.over.50 := ( UMFE010) / UMFE001]
  data.temp <- data.temp[,c('zip5','pct.rent.over.50'),with=F]
  data.working <- merge(data.working,data.temp,by='zip5')
  
  # Return the result
  return(data.working)
}


